Notebook 8: Metagenomes Functional Profiles

Kevin Bonham, PhD

Now we've been through some analyses of the taxonomic profiles, but metagenomes also allow the generation of functional profiles; that is, what genes are present in a given community.

These profiles were generated with HUMANn2.

There are a few different ways to classify genes, some of which capture more of the total sequence space than others. For example, UniProt's UniRef90 groupings capture basically everything that's ever been sequenced before, but many of those labels have no actual information associated with them. By contrast, InterPro's PFam and Kegg Orthology (KO) databases are much smaller, but each annotation has some information associated with it.

This also means that the uniref90 files are much larger.

uniref_path = "data/engaging/merged/batch1-10_genefamilies_relab.tsv"
pfam_path = "data/engaging/merged/batch1-10_pfam_names_relab.tsv"
ko_path = "data/engaging/merged/batch1-10_ko_names_relab.tsv"

# UniRef90s Gb
filesize(uniref_path) / 1e9
9.293069462
# Pfam Gb
filesize(pfam_path) / 1e9
0.787214265
# KEGG Orthology Gb
filesize(ko_path) / 1e9
0.295653706
filesamples = samples_from_file(uniref_path)
filesamples[1:5]
5-element Array{NamedTuple{(:sample, :subject, :timepoint),Tuple{String,Int64,Int64}},1}:
 (sample = "C0005_3F_1A", subject = 5, timepoint = 3) 
 (sample = "C0016_3E_1A", subject = 16, timepoint = 3)
 (sample = "C0016_3F_1A", subject = 16, timepoint = 3)
 (sample = "C0016_3F_2A", subject = 16, timepoint = 3)
 (sample = "C0016_4F_1A", subject = 16, timepoint = 4)

Each of these tables is taxonomically stratified - that is, each gene group is also split by the organism that contributed it. For now, we don't need that information, and the unstratified tables are much smaller.

uniref_unstrat = "data/engaging/merged/batch1-10_genefamilies_relab_unstratified.tsv"
@assert samples_from_file(uniref_unstrat) == filesamples
filesize(uniref_unstrat) / 1e9
3.971306867

Even still, this file is so large that the typical CSV parsers have issues dealing with it. Since I know the first column is strings (the features) and every other column is floats, I can just do this manually.

#Get unique samples for each subject and subset by type
kidallsamples = uniquesamples(filesamples, samplefilter=s-> startswith(s, "C"))
momallsamples = uniquesamples(filesamples, samplefilter=s-> startswith(s, "M"))
kidallmeta = load_metadata(datatoml, samples=kidallsamples)
momallmeta = load_metadata(datatoml, samples=momallsamples)
function filter_tsv(filename, keep)
    colnames = Symbol.(split(first(eachline(filename)), '\t'))[keep]

    df = DataFrame(colnames[1]=>String[], (c=>Float64[] for c in colnames[2:end])...)
    for row in CSV.Rows(filename)
        row = row[keep]
        rownums = (parse(Float64, row[i]) for i in 2:length(row))
        all(isequal(0.), rownums) && continue
        push!(df, (row[1], rownums...))
    end
    return df
end


keepers = Set([kidallsamples; momallsamples])
keep = [true, map(s-> in(s, keepers), filesamples)...]

function normalize_colname(col)
    c = String(col)
    c = replace(c, r"_S\d{1,2}.*$"=>"")
    c = replace(c, "-"=>"_")
    return Symbol(c)
end

unirefs = filter_tsv(uniref_unstrat, keep)
names!(unirefs, [:uniref, (normalize_colname(col) for col in names(unirefs)[2:end])...])
permutecols!(unirefs, [:uniref, (Symbol(getfield(s, :sample)) for s in [kidallsamples; momallsamples])...])

# some checks to make sure different gene types didn't make it through
@assert !any(row-> occursin("GeneID", row[1]), eachrow(unirefs))
@assert !any(row-> occursin("UniRef50", row[1]), eachrow(unirefs))
unirefs = abundancetable(unirefs)
uniref_meta = load_metadata(datatoml, samples=[kidallsamples; momallsamples])

moms = view(unirefs, sites=getfield.(momallsamples, :sample))
moms = view(moms, species= map(row-> !all(isequal(0.), row), eachrow(occurrences(moms))))

dropmissing!(kidallmeta, :correctedAgeDays)
kids = view(unirefs, sites = map(s-> s in kidallmeta.sample, sitenames(unirefs)))
kids = view(kids, species= map(row-> !all(isequal(0.), row), eachrow(occurrences(kids))))

Because the microbial taxonomic diversity increases with age, we also expect the gene diversity to increase. We can just look at the total number of genes by age:

# get matrix of gene abundances
occ = occurrences(kids)

occ_srt = view(occ, :, sortperm(kidallmeta.correctedAgeDays))

num_genes = map(col-> sum(x-> x > 0, col), eachcol(occ_srt))
scatter(sort(kidallmeta.correctedAgeDays), num_genes,
    legend=false, xlabel="Age (days)", ylabel="Identified UniRef90s",
    title="Genes by age", color=:lightgrey)

Going forward, we don't really care about genes that are very rare, or are present in nearly everyone. So I'll filter out genes that are present in < 5% or more than 90% of samples. And I'm reassigning the variables rather than using views just because the original table is so enormous.

Microbiome.prevalence(a, minabundance::Float64=0.0001) = mean(x-> present(x, minabundance), (y for y in a))

prevfilt = map(eachrow(occurrences(unirefs))) do row
    u1_prev = prevalence(row[map(x-> x == "1 and under", uniref_meta.ageLabel)], 0.)
    o1_prev = prevalence(row[map(x-> x != "mom" && x != "1 and under", uniref_meta.ageLabel)], 0.)
    mom_prev = prevalence(row[map(x-> x == "mom", uniref_meta.ageLabel)], 0.)

    return (any(>(0.05), [u1_prev, o1_prev, mom_prev]), all(<(0.9), [u1_prev, o1_prev, mom_prev]) )
end

prevalent = view(unirefs, species=[p[1] for p in prevfilt])
accessory = view(unirefs, species=map(all, prevfilt))

Here's our typical PCoA:

dm = pairwise(BrayCurtis(), prevalent)
mds = fit(MDS, dm, distances=true)
plot(mds, group=uniref_meta.ageLabel, color=color3',
    legend=:bottomright, title="All genes, All samples", markersize=3)
dm = pairwise(BrayCurtis(), accessory)
mds = fit(MDS, dm, distances=true)
plot(mds, group=uniref_meta.ageLabel, color=color3', markersize=3,
    legend=:bottomright, title="Accessory genes, All samples")

Until now, I've been looking at all samples. Here, let's filter down to only the first sample for each subject.

ukidsamples = uniquesamples(kidallsamples, identifiers=[:subject])
umomsamples = uniquesamples(momallsamples, identifiers=[:subject])

uprevalent = view(prevalent, sites=getfield.([ukidsamples; umomsamples], :sample))
umeta = load_metadata(datatoml, samples=[ukidsamples; umomsamples])
dm = pairwise(BrayCurtis(), uprevalent)
mds = fit(MDS, dm, distances=true)

plot(mds, group=umeta.ageLabel, color=color3',
    legend=:topright, title="All genes, first samples")
uaccessory = view(accessory, sites=getfield.([ukidsamples;umomsamples], :sample))
dm = pairwise(BrayCurtis(), uaccessory)
mds = fit(MDS, dm, distances=true)

plot(mds, group=umeta.ageLabel, color=color3',
    legend=:topright, title="Accessory genes, first samples")
ukidsmeta = load_metadata(datatoml, samples=ukidsamples)
dropmissing!(ukidsmeta, :correctedAgeDays)
umomsmeta = load_metadata(datatoml, samples=umomsamples)

umoms = view(uaccessory, sites=getfield.(umomsamples, :sample))
ukids = view(uaccessory, sites=map(s-> s in ukidsmeta.sample, sitenames(uaccessory)))

umoms = copy(view(umoms, species = map(row-> any(x-> x > 0, row), eachrow(occurrences(umoms)))))
ukids = copy(view(ukids, species = map(row-> any(x-> x > 0, row), eachrow(occurrences(ukids)))))

occ = occurrences(ukids)
@assert !any(row-> all(isequal(0.), row), eachrow(occ))

Looking for associations

If we just pulled a bunch of genes at random, we'd expect some of them to positively or negatively correlate with our subject covariates. Probably, they'd have a normal distribution of correlations.

agecors = cor(ukidsmeta.correctedAgeDays, occ, dims=2)'
agecors = [isnan(c) ? 0 : c for c in agecors]
agerank = sortperm(agecors)
describe(agecors)
Summary Stats:
Length:         664950
Missing Count:  0
Mean:           0.016642
Minimum:        -0.443329
1st Quartile:   -0.111237
Median:         0.012144
3rd Quartile:   0.126674
Maximum:        0.585687
Type:           Float64
itp = LinearInterpolation(eachindex(agecors), agecors[agerank])

plot(itp([range(1, stop=length(itp), length=100)]...),
    xlabel="rank", ylabel="Pearson correlation with age", label="all genes",
    color=color1[2], legend=:bottomright, title="UniRef90 age correlations")

Rather than doing a statistical test on each gene function directly, we can do a variant of "gene set enrichment analysis" (GSEA). Here, we use a statistical test called a Mann-Whitney U test to ask if a set of genes as a whole (say, antibiotic resistance genes) tend to be positively or negatively correlated with age.

If any given antibiotic resistance gene is as likely to be positively correlated with age as negatively correlated, we'd be less confident that any given correlation is "real", it could just be random. But if all or even most AbxR genes fall on one side, we can be more confident that there's something interesting to look at.

This set of AbxR genes comes from UniProt.

abxr = CSV.read("data/uniprot/uniprot-abxr.tsv")

features = match.(r"UniRef90_(\w+)", featurenames(ukids))
@assert length(features) == length(agecors)
@assert all(!isnothing, features)
features = [m.captures[1] for m in features]

searchset = Set(abxr.Entry)
abx_pos = findall(x-> x in searchset, features)
mwu = MannWhitneyUTest(agecors[abx_pos], agecors[Not(abx_pos)])
Approximate Mann-Whitney U test
-------------------------------
Population details:
    parameter of interest:   Location parameter (pseudomedian)
    value under h_0:         0
    point estimate:          -0.1882160770828409

Test summary:
    outcome with 95% confidence: reject h_0
    two-sided p-value:           <1e-36

Details:
    number of observations in each group: [116, 664834]
    Mann-Whitney-U statistic:             1.2209567e7
    rank sums:                            [1.2216353e7, 2.21067367372e11]
    adjustment for ties:                  23166.0
    normal approximation (μ, σ):          (-2.6350805e7, 2.0672347843504206e6)
m = round(median(agecors[abx_pos]), sigdigits=4)
p = round(pvalue(mwu), sigdigits=4)

plot(range(1, stop=length(itp), length=100), itp(range(1, stop=length(itp), length=100)),
    xlabel="rank", ylabel="Pearson correlation with age", legend=:bottomright,
    label="all genes", color=color1[2], title = "UniRef90 age correlations")
scatter!(invperm(agerank)[abx_pos], agecors[abx_pos], color=color1[3], label="abxR genes",
    legend=:bottomright)
annotate!(1000, 0.3, "median = $m\npvalue=$p", :left)

The following code can be uncommented to do a random permutation test to show that if genes are selected at random, the pvalues are uniformaly distributed as expected. The code here is not run because it takes a really long time.

# # test some random samples
# ps = Float64[]
#
# for _ in 1:5000
#     s = sample(1:length(agecors), 50)
#     push!(ps, pvalue(MannWhitneyUTest(agecors[s], agecors[Not(s)])))
# end
#
# histogram(ps, legend=false, title="Random gene subsets (5k)", xlabel="p-value", ylabel="number")

We can do the same test for carbohydrate metabolims genes from UniProt...

carbs = CSV.read("data/uniprot/uniprot-carbohydrate.tsv")
searchset = Set(carbs.Entry)
carbs_pos = findall(x-> x in searchset, features)
mwu = MannWhitneyUTest(agecors[carbs_pos], agecors[Not(carbs_pos)])
Approximate Mann-Whitney U test
-------------------------------
Population details:
    parameter of interest:   Location parameter (pseudomedian)
    value under h_0:         0
    point estimate:          -0.19776959266472663

Test summary:
    outcome with 95% confidence: reject h_0
    two-sided p-value:           <1e-80

Details:
    number of observations in each group: [261, 664689]
    Mann-Whitney-U statistic:             2.75828e7
    rank sums:                            [2.7616991e7, 2.21051966734e11]
    adjustment for ties:                  23166.0
    normal approximation (μ, σ):          (-5.91591145e7, 3.1005140108452165e6)
m = round(median(agecors[carbs_pos]), sigdigits=4)
p = round(pvalue(mwu), sigdigits=4)
plot(range(1, stop=length(itp), length=100), itp(range(1, stop=length(itp), length=100)),
    xlabel="rank", ylabel="Pearson correlation with age", legend=:bottomright,
        label="all genes", color=color1[2], title = "UniRef90 age correlations")

scatter!(invperm(agerank)[carbs_pos], agecors[carbs_pos], color=color1[4], label="Carbohydrate metabolism genes")
annotate!(1000, 0.3, "median = $m\npvalue=$p", :left)

... and fatty acid metabolism genes.

fa = CSV.read("data/uniprot/uniprot-fa.tsv")
searchset = Set(fa.Entry)
fa_pos = findall(x-> x in searchset, features)
mwu = MannWhitneyUTest(agecors[fa_pos], agecors[Not(fa_pos)])
Approximate Mann-Whitney U test
-------------------------------
Population details:
    parameter of interest:   Location parameter (pseudomedian)
    value under h_0:         0
    point estimate:          -0.18405607432865664

Test summary:
    outcome with 95% confidence: reject h_0
    two-sided p-value:           <1e-25

Details:
    number of observations in each group: [88, 664862]
    Mann-Whitney-U statistic:             1.01875625e7
    rank sums:                            [1.01914785e7, 2.210693922465e11]
    adjustment for ties:                  23166.0
    normal approximation (μ, σ):          (-1.90663655e7, 1.8005753097980688e6)
m = round(median(agecors[fa_pos]), sigdigits=4)
p = round(pvalue(mwu), sigdigits=4)
plot(range(1, stop=length(itp), length=100), itp(range(1, stop=length(itp), length=100)),
    xlabel="rank", ylabel="Pearson correlation with age", legend=:bottomright,
        label="all genes", color=color1[2], title = "UniRef90 age correlations")

scatter!(invperm(agerank)[fa_pos], agecors[fa_pos], color=color1[5], label="Fatty Acid metabolism genes")
annotate!(1000, 0.3, "median = $m\npvalue=$p", :left)

Reproducibility of previous findings

Vatanen et. al. 2018 investigated the metagenomes of a different cohort of children to test for associations with risk of developing Type I diabetes. In figure 3 of this paper, they showed a number of enzyme classes that were associated with the age of children. We can map these enzype classes (ECs) to UniRef90s as gene sets, and test if they are also associated with age in our cohort

ecs2uniref = Dict()
for line in eachline("data/engaging/ec2uniref90.txt")
    line = split(line, '\t')
    ecs2uniref[line[1]] = map(x-> String(match(r"UniRef90_(\w+)", x).captures[1]), line[2:end])
end
pos_control = let pc = []
    for ec in ("6.1.1.18", "2.8.4.4", "2.2.1.1", "2.7.1.144", "2.3.1.179", "5.4.2.12",
               "2.7.1.26", "2.7.1.11", "4.1.1.49", "2.6.1.83", "2.3.1.51", "1.7.99.1",
               "2.5.1.3", "1.7.99.1", "2.4.1.21", "2.5.1.3","4.1.99.17","2.7.1.90",
               "3.4.24.78","2.5.1.49","1.4.1.14", "4.1.1.3")
         searchset = Set(ecs2uniref[ec])
         pos = findall(x-> x in searchset, features)
         mwu = MannWhitneyUTest(agecors[pos], agecors[Not(pos)])
         push!(pc, (median(agecors[pos]), pvalue(mwu)))
    end
    pc
end

# what fraction of these genes are + correlated?
count(x-> x[1] > 0, pos_control) / length(pos_control)
0.9545454545454546
# what fraction of these genes are + correlated and significant?
count(x-> x[1] > 0 && x[2] < 0.01, pos_control) / length(pos_control)
0.7272727272727273
neg_control = let nc = []
    for ec in ("3.6.1.1","2.7.7.56","2.6.1.42","3.1.22.4","1.6.1.2","1.1.1.44","5.4.2.11",
               "6.3.4.18","6.3.1.2","2.3.1.117","5.3.1.6","4.2.1.1","2.7.7.72","2.5.1.74",
               "2.3.1.54","4.4.1.8","2.7.1.15","1.11.1.15","6.3.4.14")
       searchset = ecs2uniref[ec]
       pos = findall(x-> x in searchset, features)
       mwu = MannWhitneyUTest(agecors[pos], agecors[Not(pos)])
       push!(nc, (median(agecors[pos]), pvalue(mwu)))
   end
   nc
end

# what fraction of these genes are - correlated?
count(x-> x[1] < 0, neg_control) / length(neg_control)
0.6842105263157895
# what fraction of these genes are - correlated and significant?
count(x-> x[1] < 0 && x[2] < 0.01, neg_control) / length(neg_control)
0.2631578947368421

KOs

In Valles-Colomer et. al., 2019, a number of "neuroactive" microbial genes were identified. They are identified by their Kegg-Orthology (KO) codes, which we can also conveniently load:

kos_unstrat = "data/engaging/merged/batch1-10_ko_names_relab_unstratified.tsv"
@assert samples_from_file(kos_unstrat) == samples_from_file(uniref_unstrat)

kos = CSV.read(kos_unstrat)
names!(kos, map(normalize_colname, names(kos)))
kos = abundancetable(kos)


ukos = view(kos, sites=getfield.([ukidsamples; umomsamples], :sample))

kos_moms = view(ukos, sites=getfield.(umomsamples, :sample))
kos_kids = view(ukos, sites = map(s-> s in ukidsmeta.sample, sitenames(ukos)))


@assert nsamples(kos_kids) == nrow(ukidsmeta)
@assert nsamples(kos_moms) == nrow(umomsmeta)

Getting neuroactive KOs

The supplementary data from that paper is in a slightly odd format, but can be easily parsed to pull out the codes we want

neuroactive = Dict()

let (mgb, desc) = ("", "")
    for line in eachline("data/uniprot/gbm.txt")
        line = split(line, r"[\t,]")
        if startswith(line[1], "MGB")
            (mgb, desc) = line
            desc = rstrip(replace(desc, r"\bI+\b.*$"=>""))
            if desc in keys(neuroactive)
                push!(neuroactive[desc].mgbs, mgb)
            else
                neuroactive[desc] = (mgbs=[mgb], kos=String[])
            end
        else
            filter!(l-> occursin(r"^K\d+$", l), line)
            append!(neuroactive[desc].kos, String.(line))
        end
    end
end

And we can convert those KOs to UniRef90s to get our gene set.

kos2uniref = Dict()
for line in eachline("data/engaging/ko2uniref90.txt")
    line = split(line, '\t')
    kos2uniref[line[1]] = map(x-> String(match(r"UniRef90_(\w+)", x).captures[1]), line[2:end])
end


na_uniref = let urefs = []
    for (key, value) in neuroactive
        for k in value.kos
            if k in keys(kos2uniref)
                append!(urefs, kos2uniref[k])
            end
        end
    end
    Set{String}(urefs)
end

neuroactive_pos = findall(f-> f in na_uniref, features)
mwu = MannWhitneyUTest(agecors[neuroactive_pos], agecors[Not(neuroactive_pos)])
Approximate Mann-Whitney U test
-------------------------------
Population details:
    parameter of interest:   Location parameter (pseudomedian)
    value under h_0:         0
    point estimate:          -0.06821803662124867

Test summary:
    outcome with 95% confidence: reject h_0
    two-sided p-value:           <1e-4

Details:
    number of observations in each group: [1290, 663660]
    Mann-Whitney-U statistic:             4.00411005e8
    rank sums:                            [4.012437e8, 2.20678340025e11]
    adjustment for ties:                  23166.0
    normal approximation (μ, σ):          (-2.7649695e7, 6.887662769107258e6)
m = round(median(agecors[neuroactive_pos]), sigdigits=4)
p = round(pvalue(mwu), sigdigits=3)

plot(range(1, stop=length(itp), length=100), itp(range(1, stop=length(itp), length=100)),
    xlabel="rank", ylabel="Pearson correlation with age", legend=:bottomright,
        label="all genes", color=color1[2], title = "UniRef90 age correlations")

scatter!(invperm(agerank)[neuroactive_pos], agecors[neuroactive_pos],
    markersize=4, color=color1[5],
    label="All neuroactive genes")
annotate!(1000, 0.3, "median = $m\npvalue=$p", :left)
na = view(unirefs, species=neuroactive_pos)
uniref_meta.neuroactive_sum = sum(occurrences(na), dims=1) |> vec
uniref_meta.identified_sum = sum(occurrences(unirefs)[3:end, :], dims=1) |> vec

@assert all(map(x-> isapprox(1., x, rtol=8), sum(occurrences(unirefs), dims=1) |> vec))

dm = pairwise(BrayCurtis(), unirefs)
mds = fit(MDS, dm, distances=true)
plot(mds, zcolor=log.(uniref_meta.identified_sum), color=:solar,
    title="Identified genes, all samples", primary=false)
plot(mds, zcolor=log.(uniref_meta.neuroactive_sum ./ uniref_meta.identified_sum), color=:solar,
    title="Neuroactive genes, all samples", primary=false)
dm = pairwise(BrayCurtis(), accessory)
mds = fit(MDS, dm, distances=true)
plot(mds, zcolor=log.(uniref_meta.identified_sum), color=:solar,
    title="Identified genes, all samples", primary=false)
plot(mds, zcolor=log.(uniref_meta.neuroactive_sum ./ uniref_meta.identified_sum), color=:solar,
    title="Neuroactive genes, all samples", primary=false)
kids_napos = findall(f-> f in na_uniref, features)
nakids = view(ukids, species=kids_napos)
ukidsmeta.neuroactive_sum = sum(occurrences(nakids), dims=1) |> vec
ukidsmeta.total_accessory = sum(occurrences(ukids), dims=1) |> vec
dm = pairwise(BrayCurtis(), ukids)
mds = fit(MDS, dm, distances=true)

plot(mds, zcolor=log.(ukidsmeta.neuroactive_sum ./ ukidsmeta.total_accessory), color=:solar,
    title="Neuroactive genes, kids samples", primary=false)
plot(mds, group=ukidsmeta.ageLabel, color=color3',
    title="Kids unique gene functions")

Checking all of the neuroactive categories in our childhood cohort:

nadf = DataFrame(description=String[], n_unirefs = Int[], median=Float64[], pvalue=Float64[], idx=[])

for (desc, (mgbs, kos)) in neuroactive
    urs = []
    for ko in kos
        ko in keys(kos2uniref) && append!(urs, kos2uniref[ko])
    end
    urs = Set(urs)
    pos = findall(f-> f in urs, features)

    n = length(pos)
    n > 2 || continue

    mwu = MannWhitneyUTest(agecors[pos], agecors[Not(pos)])
    m = round(median(agecors[pos]), sigdigits=4)
    p = round(pvalue(mwu), sigdigits=4)

    push!(nadf, (desc, n, m, p, pos))
    plt = plot(range(1, stop=length(itp), length=100), itp(range(1, stop=length(itp), length=100)),
        xlabel="rank", ylabel="Pearson correlation with age", legend=:bottomright,
            label="all genes", color=color1[2], title = "UniRef90 age correlations")

    scatter!(invperm(agerank)[pos], agecors[pos],
        markersize=4, color=color1[1],
        label="$desc")
    annotate!(1000, 0.3, "n = $n\nmedian = $m\npvalue=$p", :left)
    display(plt)
    savefig(joinpath(figures, "uniref90_age_correlations_neuroactive_$(replace(desc, r"[^\w]+"=>'-')).svg"))
end
nadf.qvalue = adjust(nadf.pvalue, BenjaminiHochberg())
@pt nadf
┌─────────────────────────────────────────┬───────────┬────────┬────────┬───────────────────────────────────────────── ⋯
│                             description │ n_unirefs │ median │ pvalue │                                              ⋯
├─────────────────────────────────────────┼───────────┼────────┼────────┼───────────────────────────────────────────── ⋯
│                  Propionate degradation │         5 │ -0.347 │    0.0 │                                              ⋯
│                          GABA synthesis │        27 │ -0.136 │    0.0 │                                              ⋯
│               Isovaleric acid synthesis │        83 │ -0.066 │  0.262 │                                              ⋯
│           17-beta-Estradiol degradation │        27 │ -0.024 │  0.732 │                                              ⋯
│                     Glutamate synthesis │        96 │  0.083 │   0.01 │                                              ⋯
│                       Acetate synthesis │       193 │ -0.009 │  0.334 │                                              ⋯
│             Quinolinic acid degradation │       113 │ -0.062 │  0.357 │                                              ⋯
│                     Acetate degradation │        11 │  0.144 │  0.404 │                                              ⋯
│                      p-Cresol synthesis │        35 │ -0.002 │  0.919 │                                              ⋯
│                      Inositol synthesis │        34 │ -0.068 │  0.149 │                                              ⋯
│                   Glutamate degradation │        25 │ -0.116 │    0.0 │                                              ⋯
│                Nitric oxide degradation │         4 │ -0.185 │  0.003 │                                              ⋯
│                    ⋮                    │     ⋮     │   ⋮    │   ⋮    │                                              ⋯
└─────────────────────────────────────────┴───────────┴────────┴────────┴───────────────────────────────────────────── ⋯
boxplot([agecors[pos] for pos in (abx_pos, carbs_pos, fa_pos, 1:length(agecors))], color=:lightgrey,
    legend=false, xticks=(1:4, ["abx", "carbs", "fa", "all"]),
    ylabel="Median Age Correlation")
sort!(nadf, :median)
plt = boxplot([agecors[pos] for pos in nadf.idx], color=:lightgrey,
    legend=false, xticks=(1:nrow(nadf), nadf.description), xrotation=45,
    ylabel="Median Age Correlation")
for (i, row) in enumerate(eachrow(nadf))
    if row.qvalue < 0.005
        annotate!(i, 0.32, "**", align=:center)
    # elseif row.qvalue < 0.01
    #     annotate!(i, 0.32, "**", align=:center)
elseif row.qvalue < 0.05
        annotate!(i, 0.32, "*", align=:center)
    end
end
display(plt)
nacols = [:sample, :subject, :timepoint, :neuroactive_sum]
@assert length(features) == nfeatures(ukids)

for row in eachrow(nadf)
    nabt = view(ukids, species=row.idx)
    col = sum(occurrences(nabt), dims=1) |> vec
    any(x-> x > 0, col) || continue
    @info row.description
    @show length(col)
    push!(nacols, Symbol(row.description))
    ukidsmeta[!, Symbol(row.description)] = col
end
length(col) = 259
length(col) = 259
length(col) = 259
length(col) = 259
length(col) = 259
length(col) = 259
length(col) = 259
length(col) = 259
length(col) = 259
length(col) = 259
length(col) = 259
length(col) = 259
length(col) = 259
length(col) = 259
length(col) = 259
length(col) = 259
length(col) = 259
length(col) = 259
length(col) = 259
length(col) = 259
length(col) = 259
length(col) = 259
length(col) = 259
length(col) = 259
length(col) = 259
@pt ukidsmeta[!, nacols]
┌─────────────┬─────────┬───────────┬─────────────────┬────────────────────────┬──────────────────┬─────────────────── ⋯
│      sample │ subject │ timepoint │ neuroactive_sum │ Propionate degradation │ GABA degradation │ Nitric oxide degra ⋯
├─────────────┼─────────┼───────────┼─────────────────┼────────────────────────┼──────────────────┼─────────────────── ⋯
│ C0005_3F_1A │       5 │         3 │           0.001 │                    0.0 │              0.0 │                    ⋯
│ C0016_3F_1A │      16 │         3 │           0.001 │                    0.0 │              0.0 │                    ⋯
│ C0017_2F_1A │      17 │         2 │           0.001 │                    0.0 │              0.0 │                    ⋯
│ C0032_9F_1A │      32 │         9 │           0.001 │                    0.0 │              0.0 │                    ⋯
│ C0043_7F_1A │      43 │         7 │           0.001 │                    0.0 │              0.0 │                    ⋯
│ C0047_7F_1A │      47 │         7 │           0.001 │                    0.0 │              0.0 │                    ⋯
│ C0052_5F_1A │      52 │         5 │           0.001 │                    0.0 │              0.0 │                    ⋯
│ C0053_6F_1A │      53 │         6 │           0.001 │                    0.0 │              0.0 │                    ⋯
│ C0058_4F_1A │      58 │         4 │           0.001 │                    0.0 │              0.0 │                    ⋯
│ C0061_6F_1A │      61 │         6 │           0.001 │                    0.0 │              0.0 │                    ⋯
│ C0062_6F_1A │      62 │         6 │           0.001 │                    0.0 │              0.0 │                    ⋯
│ C0066_6F_1A │      66 │         6 │           0.001 │                    0.0 │              0.0 │                    ⋯
│      ⋮      │    ⋮    │     ⋮     │        ⋮        │           ⋮            │        ⋮         │            ⋮       ⋯
└─────────────┴─────────┴───────────┴─────────────────┴────────────────────────┴──────────────────┴─────────────────── ⋯

Cognitive Scores

That was correlation with age, which is interesting, but we'd also like to know if these microbial genes are associated with neurocognitive features in our kiddos.

cog_filter = map(row-> row.correctedAgeDays > 365 && !ismissing(row.cogScore), eachrow(ukidsmeta))
cogmeta = ukidsmeta[cog_filter, :]
cog = view(ukids, sites=cog_filter)
cog = copy(view(cog, species=map(row->any(>(0.), row), eachrow(occurrences(cog)))))

occ = occurrences(cog)

cogcors = cor(cogmeta.cogScore, occ, dims=2)'
@assert !any(isnan, cogcors)
cogrank = sortperm(cogcors)
describe(cogcors)
Summary Stats:
Length:         637965
Missing Count:  0
Mean:           -0.012269
Minimum:        -0.343552
1st Quartile:   -0.070157
Median:         -0.011670
3rd Quartile:   0.047087
Maximum:        0.323392
Type:           Float64
itp = LinearInterpolation(eachindex(cogcors), cogcors[cogrank])

plot(range(1, stop=length(itp), length=100), itp(range(1, stop=length(itp), length=100)),
    markersize=1, xlabel="rank", ylabel="Pearson correlation with Cognitive Score",
    label="all genes", color=color1[2], title="UniRef90 cogScore correlations", legend=:bottomright)
abxr = CSV.read("data/uniprot/uniprot-abxr.tsv")

features = match.(r"UniRef90_(\w+)", featurenames(cog))
@assert !any(isnothing, features)
features = [f.captures[1] for f in features]

searchset = Set(abxr.Entry)
abx_pos = findall(x-> x in searchset, features)
mwu = MannWhitneyUTest(cogcors[abx_pos], cogcors[Not(abx_pos)])
Approximate Mann-Whitney U test
-------------------------------
Population details:
    parameter of interest:   Location parameter (pseudomedian)
    value under h_0:         0
    point estimate:          -0.06324039591272557

Test summary:
    outcome with 95% confidence: reject h_0
    two-sided p-value:           <1e-13

Details:
    number of observations in each group: [109, 637856]
    Mann-Whitney-U statistic:             2.04536835e7
    rank sums:                            [2.04596785e7, 2.034795299165e11]
    adjustment for ties:                  1.4163327e8
    normal approximation (μ, σ):          (-1.43094685e7, 1.9225724527628175e6)
m = round(median(cogcors[abx_pos]), sigdigits=4)
p = round(pvalue(mwu), sigdigits=4)

plot(range(1, stop=length(itp), length=100), itp(range(1, stop=length(itp), length=100)),
    xlabel="rank", ylabel="Pearson correlation with cogscore", legend=:bottomright,
    label="all genes", color=color1[2], title = "UniRef90 cogscore correlations")

scatter!(invperm(cogrank)[abx_pos], cogcors[abx_pos], color=color1[3], label="abxR genes")
annotate!(1000, 0.2, "median = $m\npvalue=$p", :left)
carbs = CSV.read("data/uniprot/uniprot-carbohydrate.tsv")
searchset = Set(carbs.Entry)
carbs_pos = findall(x-> x in searchset, features)
mwu = MannWhitneyUTest(cogcors[carbs_pos], cogcors[Not(carbs_pos)])
Approximate Mann-Whitney U test
-------------------------------
Population details:
    parameter of interest:   Location parameter (pseudomedian)
    value under h_0:         0
    point estimate:          -0.07296337978438688

Test summary:
    outcome with 95% confidence: reject h_0
    two-sided p-value:           <1e-27

Details:
    number of observations in each group: [236, 637729]
    Mann-Whitney-U statistic:             4.3882717e7
    rank sums:                            [4.3910683e7, 2.03456078912e11]
    adjustment for ties:                  1.4163327e8
    normal approximation (μ, σ):          (-3.1369305e7, 2.8286696355078514e6)
m = round(median(cogcors[carbs_pos]), sigdigits=4)
p = round(pvalue(mwu), sigdigits=4)
plot(range(1, stop=length(itp), length=100), itp(range(1, stop=length(itp), length=100)),
    xlabel="rank", ylabel="Pearson correlation with cogscore", legend=:bottomright,
    label="all genes", color=color1[2], title = "UniRef90 cogscore correlations")
scatter!(invperm(cogrank)[carbs_pos], cogcors[carbs_pos], color=color1[4], label="Carbohydrate metabolism genes")
annotate!(1000, 0.2, "median = $m\npvalue=$p", :left)
fa = CSV.read("data/uniprot/uniprot-fa.tsv")
searchset = Set(fa.Entry)
fa_pos = findall(x-> x in searchset, features)
mwu = MannWhitneyUTest(cogcors[fa_pos], cogcors[Not(fa_pos)])
Approximate Mann-Whitney U test
-------------------------------
Population details:
    parameter of interest:   Location parameter (pseudomedian)
    value under h_0:         0
    point estimate:          -0.07197820460265598

Test summary:
    outcome with 95% confidence: reject h_0
    two-sided p-value:           <1e-10

Details:
    number of observations in each group: [79, 637886]
    Mann-Whitney-U statistic:             1.44081695e7
    rank sums:                            [1.44113295e7, 2.034855782655e11]
    adjustment for ties:                  1.4163327e8
    normal approximation (μ, σ):          (-1.07883275e7, 1.6367909862662042e6)
m = round(median(cogcors[fa_pos]), sigdigits=4)
p = round(pvalue(mwu), sigdigits=4)
plot(range(1, stop=length(itp), length=100), itp(range(1, stop=length(itp), length=100)),
    xlabel="rank", ylabel="Pearson correlation with cogscore", legend=:bottomright,
    label="all genes", color=color1[2], title = "UniRef90 cogscore correlations")
scatter!(invperm(cogrank)[fa_pos], cogcors[fa_pos], color=color1[5], label="Fatty Acid metabolism genes")
annotate!(1000, 0.2, "median = $m\npvalue=$p", :left)
cognadf = DataFrame(description=String[], n_unirefs = Int[], median=Float64[], pvalue=Float64[], idx=[])

for (desc, (mgbs, kos)) in neuroactive
    urs = []
    for ko in kos
        ko in keys(kos2uniref) && append!(urs, kos2uniref[ko])
    end
    urs = Set(urs)
    pos = findall(f-> f in urs, features)

    n = length(pos)
    n > 2 || continue

    mwu = MannWhitneyUTest(cogcors[pos], cogcors[Not(pos)])
    m = round(median(cogcors[pos]), sigdigits=4)
    p = round(pvalue(mwu), sigdigits=4)

    push!(cognadf, (desc, n, m, p, pos))
    plt = plot(range(1, stop=length(itp), length=100), itp(range(1, stop=length(itp), length=100)),
        xlabel="rank", ylabel="Pearson correlation with cogscore", legend=:bottomright,
        label="all genes", color=color1[2], title = "UniRef90 cogscore correlations")
    scatter!(invperm(cogrank)[pos], cogcors[pos],
        markersize=4, color=color1[1],
        label="$desc")
    annotate!(1000, 0.2, "n = $n\nmedian = $m\npvalue=$p", :left)
    display(plt)
    savefig(joinpath(figures, "uniref90_cog_correlations_neuroactive_$(replace(desc, r"[^\w]+"=>'-')).svg"))
end
cognadf.qvalue = adjust(cognadf.pvalue, BenjaminiHochberg())
@pt cognadf
boxplot([cogcors[pos] for pos in (abx_pos, carbs_pos, fa_pos, 1:length(cogcors))], color=:lightgrey,
    legend=false, xticks=(1:4, ["abx", "carbs", "fa", "all"]),
    ylabel="Median cogScore Correlation")
sort!(cognadf, :median)
plt = boxplot([cogcors[pos] for pos in cognadf.idx], color=:lightgrey,
    legend=false, xticks=(1:nrow(nadf), cognadf.description), xrotation=45,
    ylabel="Median cogScore Correlation")
for (i, row) in enumerate(eachrow(cognadf))
    if row.qvalue < 0.005
        annotate!(i, 0.15, "**", align=:center)
    # elseif row.qvalue < 0.01
    #     annotate!(i, 0.22, "**", align=:center)
elseif row.qvalue < 0.05
        annotate!(i, 0.15, "*", align=:center)
    end
end
display(plt)

Pfams

# pfam = CSV.read("data/engaging/merged/batch1-10_pfam_names_relab_unstratified.tsv")
# names!(pfam, map(n-> Symbol(resolve_sampleID(replace(String(n), r"S\d+_Abundance-RELAB"=>"")).sample), names(pfam)))
# pfam_abt = abundancetable(pfam)
# upfam = view(pfam_abt, sites=getfield.([kidsamples; momsamples], :sample))
#
# pfam_moms = view(upfam, sites=getfield.(momsamples, :sample))
# pfam_kids = view(upfam, sites = map(s-> s in kidsmeta.sample, sitenames(upfam)))
#
# @assert nsamples(kos_kids) == nrow(kidsmeta)
# @assert nsamples(kos_moms) == nrow(momsmeta)
# @assert all(i-> samplenames(upfam)[i] == umeta.sample[i], nsamples(upfam))

Compare to taxonomic profiles

tax = load_taxonomic_profiles()
taxfilter!(tax)
tax = abundancetable(tax)
relativeabundance!(tax)
utax = view(tax, sites=getfield.([ukidsamples; umomsamples], :sample))

@assert samplenames(utax) == samplenames(uaccessory)

funcdm = pairwise(BrayCurtis(), uaccessory)
funcmds = fit(MDS, funcdm, distances=true)
taxdm = pairwise(BrayCurtis(), utax)
taxmds = fit(MDS, taxdm, distances=true)
scatter(projection(taxmds)[:,1], projection(funcmds)[:,1],
    group=umeta.ageLabel, color=color3',
    xlabel="Taxonomic MDS1", ylabel="Accessory genes MDS1")
scatter(projection(taxmds)[:,2], projection(funcmds)[:,2],
    group=umeta.ageLabel, color=color3', legend=:bottomright,
    xlabel="Taxonomic MDS2", ylabel="Accessory genes MDS2")
ufeatures = match.(r"UniRef90_(\w+)", featurenames(uaccessory))
@assert !any(isnothing, ufeatures)
ufeatures = [f.captures[1] for f in ufeatures]

neuroactive_pos = findall(f-> f in na_uniref, ufeatures)
na = view(uaccessory, species=neuroactive_pos)
umeta.neuroactive_sum = sum(occurrences(na), dims=1) |> vec
umeta.total_sum = sum(occurrences(uaccessory), dims=1) |> vec
scatter(projection(taxmds)[:,1], projection(funcmds)[:,1],
    zcolor=umeta.neuroactive_sum ./ umeta.total_sum, color=:solar, primary=false,
    xlabel="Taxonomic MDS1", ylabel="Accessory genes MDS1", title="Neuroactive / total accessory")
scatter(projection(taxmds)[:,2], projection(funcmds)[:,2],
    zcolor=umeta.neuroactive_sum ./ umeta.total_sum, color=:solar, primary=false,
    xlabel="Taxonomic MDS2", ylabel="Accessory genes MDS2", title="Neuroactive / total accessory")
pcopri = occurrences(view(utax, species=["Prevotella_copri"]))|> vec
scatter(projection(taxmds)[:,1], projection(funcmds)[:,1],
    zcolor=log.(pcopri .+ 1e-4), color=:plasma, primary=false,
    xlabel="Taxonomic MDS1", ylabel="Accessory genes MDS1")
scatter(projection(taxmds)[:,2], projection(funcmds)[:,2],
    zcolor=log.(pcopri .+ 1e-4), color=:plasma, primary=false,
    xlabel="Taxonomic MDS2", ylabel="Accessory genes MDS2")
αdiv = shannon(utax)
scatter(projection(taxmds)[:,1], projection(funcmds)[:,1],
    zcolor=αdiv, color=:viridis, primary=false,
    xlabel="Taxonomic MDS1", ylabel="Accessory genes MDS1")
scatter(projection(taxmds)[:,2], projection(funcmds)[:,2],
    zcolor=αdiv, color=:viridis, primary=false,
    xlabel="Taxonomic MDS2", ylabel="Accessory genes MDS2")